import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import plotly.graph_objects as go
import datetime
This project first builds filters applied to each row of the assessor records:
Filtering out invested properties, which uses the following factors:
Filtering out flipping properties, using these conditions:
After extracting that special data, checking whether a sold property is On or Off market, using:
# --- Load the datasets, reading only the columns this analysis needs ---
DATA_DIR = "/Users/ngothixuan/Desktop/On-Off-Project1/Data"

# Assessor records (1st American): source for both On and Off market checks
assessor_records = pd.read_csv(
    f"{DATA_DIR}/ds_assessor_records.csv",
    usecols=["propertyid", "situscity",
             "situsfullstreetaddress", "mailingfullstreetaddress",
             'ownername1full',
             'currentsalesprice', 'prevsalesprice',
             'currentsalerecordingdate', 'prevsalerecordingdate',
             'situscensustract', 'situscensusblock', 'situslatitude', 'situslongitude', 'yearbuilt',
             "storiesnbrcode"])

# 1st American listings: On-market evidence
american_list = pd.read_csv(
    f"{DATA_DIR}/ds_listings.csv",
    usecols=["propertyid", "situscity",
             'situsfullstreetaddress',
             "propertytype",
             "status", "solddate"])

# MLS listings: the three metros share one column layout
_MLS_USECOLS = ["county_data_id", "city",
                "property_key", 'property_type', 'unit_count',
                "listed_on", "status", "sold_on"]
chicago_list = pd.read_csv(f"{DATA_DIR}/red_chicago_property_listings.csv", usecols=_MLS_USECOLS)
dallas_list = pd.read_csv(f"{DATA_DIR}/red_dallas_property_listings.csv", usecols=_MLS_USECOLS)
denver_list = pd.read_csv(f"{DATA_DIR}/red_denver_property_listings.csv", usecols=_MLS_USECOLS)
def clean_listing_data(df, id_colname, city_colname, city_name):
    """Clean an MLS or 1st American listing dataframe down to one city's sold rows.

    INPUT:
        - df: DataFrame, for example: chicago_list / american_list
        - id_colname: ID column name, e.g. "county_data_id" for MLSes, "propertyid" for 1st American
        - city_colname: name of the column that holds the city
        - city_name: city to keep, as a lower-case string such as 'denver'
    OUTPUT:
        - DataFrame with a non-missing, positive, integer ID column, only rows
          whose status is "sold" (case-insensitive), a non-numeric lower-cased
          city column, and only rows matching the chosen city.
    """
    # Drop rows with a missing ID, then take a copy so the column assignments
    # below operate on an owned frame (the original chained .loc assignment
    # could silently fail to change the dtype and risked SettingWithCopy).
    out = df[df[id_colname].notna()].copy()
    # Cast the ID column to int64 for clean merging downstream.
    out[id_colname] = out[id_colname].astype(np.int64)
    # Keep only positive IDs.
    out = out[out[id_colname] > 0]
    # Keep only sold listings.
    out = out[out['status'].str.lower() == "sold"]
    # Clean the city column: drop missing values, drop purely-numeric "cities",
    # then normalise to lower case for matching.
    out = out[out[city_colname].notna()]
    out = out[out[city_colname].str.isnumeric() != True]
    out[city_colname] = out[city_colname].str.lower()
    # Keep only the requested city.
    return out[out[city_colname] == city_name]
def clean_assessor_record(df, city_name):
    """Clean an assessor-records dataframe down to one city's sold rows.

    INPUT:
        - df: DataFrame, for example: assessor_records
        - city_name: city to keep, as a lower-case string such as 'denver'
    OUTPUT:
        - DataFrame with a non-missing, positive, integer "propertyid" column,
          only sold properties (rows that have at least one of
          "currentsalerecordingdate" / "prevsalerecordingdate"), and only rows
          matching the chosen city (lower-cased, non-numeric "situscity").
    """
    # Drop rows with a missing propertyid, then take a copy so the column
    # assignments below operate on an owned frame (the original chained .loc
    # assignment could silently fail to change the dtype).
    out = df[df["propertyid"].notna()].copy()
    out["propertyid"] = out["propertyid"].astype(np.int64)
    # Keep only positive IDs.
    out = out[out["propertyid"] > 0]
    # "Sold" = at least one recorded sale date present.
    out = out.dropna(subset=["currentsalerecordingdate", "prevsalerecordingdate"], how="all")
    # Clean the city column: drop missing and purely-numeric values, lower-case.
    out = out[out['situscity'].notna()]
    out = out[out['situscity'].str.isnumeric() != True]
    out['situscity'] = out['situscity'].str.lower()
    # Keep only the requested city.
    return out[out['situscity'] == city_name]
Comparing Addresses - Counting Owner Appearances - Profit Percentage Filter
# compare address
# count ownername and keep those > 0
# profit > 0
# 10% <= profit < 1000%
def invested_properties(df, min_owner_count=0):
    """Identify invested properties in an assessor-records dataframe.

    INPUT:
        - df: dataframe containing the assessor-record columns listed below,
          e.g. the output of clean_assessor_record
        - min_owner_count: keep only owners whose name appears strictly more
          than this many times (default 0 keeps everyone, matching the
          original behaviour; raise it to focus on repeat investors)
    OUTPUT:
        - dataframe of rows whose situs and mailing addresses differ, whose
          owner passes the count threshold, with positive profit and a profit
          percentage in [10%, 1000%). Adds `profit` and `profit_percentage`
          columns.
    """
    selected_cols = ['propertyid', 'situsfullstreetaddress', 'situscity',
                     'ownername1full', 'mailingfullstreetaddress', 'yearbuilt',
                     'storiesnbrcode', 'currentsalerecordingdate', 'currentsalesprice',
                     'prevsalerecordingdate', 'prevsalesprice']  # only these columns are needed here
    sub_df = df[selected_cols].drop_duplicates(keep='first')
    sub_df = sub_df.dropna(subset=['situsfullstreetaddress', 'mailingfullstreetaddress',
                                   'ownername1full', 'currentsalesprice', 'prevsalesprice'],
                           how='any')
    # Owner-occupied check. NOTE(review): `isin` flags a row when its situs
    # address appears anywhere in the mailing-address column (dataset-wide),
    # not only on the same row — kept as-is to preserve existing results, but
    # confirm this is intended rather than a row-wise comparison.
    match_address_df = sub_df[sub_df['situsfullstreetaddress'].str.lower()
                              .isin(sub_df['mailingfullstreetaddress'].str.lower())]
    # Keep rows whose addresses differ.
    different_address_df = sub_df.drop(index=match_address_df.index.values)
    # Count owner-name appearances; .copy() so the column assignments below
    # do not hit SettingWithCopy behaviour on a filtered view.
    invested_owners = (different_address_df.groupby('ownername1full', as_index=False)
                       .filter(lambda g: len(g) > min_owner_count).copy())
    # Keep only profitable sales with 10% <= profit percentage < 1000%.
    invested_owners['profit'] = invested_owners.currentsalesprice - invested_owners.prevsalesprice
    invested_owners = invested_owners[invested_owners.profit > 0].copy()
    invested_owners['profit_percentage'] = round((invested_owners.profit / invested_owners.prevsalesprice) * 100, 2)
    address_owners_profit = invested_owners[(invested_owners.profit_percentage >= 10) &
                                            (invested_owners.profit_percentage < 1000)]
    return address_owners_profit
def identify_flip(df):
    """Filter flip properties: resold within 2 years at 25% <= profit% < 1000%.

    INPUT:
        - df: assessor-records dataframe (must contain the columns listed below)
    OUTPUT:
        - flip: dataframe of potential flip properties, with added
          `time_between_sale` (years), `profit`, and `profit_percentage`
          columns. (Scratch date-copy columns from the old version are no
          longer emitted.)
    """
    selected_cols = ['propertyid', 'situsfullstreetaddress', 'situscity',
                     'ownername1full', 'mailingfullstreetaddress', 'yearbuilt',
                     'storiesnbrcode', 'currentsalerecordingdate', 'currentsalesprice',
                     'prevsalerecordingdate', 'prevsalesprice']
    # BUG FIX: the original did `df.drop_duplicates(...)` here, which silently
    # discarded the column selection made just above.
    df_flip = df[selected_cols].drop_duplicates(keep='first')
    df_flip = df_flip.dropna(subset=['yearbuilt', 'currentsalesprice',
                                     'prevsalesprice', 'currentsalerecordingdate',
                                     'prevsalerecordingdate'], how="any").copy()

    def _sale_year(col):
        # Recording dates are stored as YYYYMMDD numbers. Cast to int64 first:
        # after a CSV round-trip with NaNs the column is float, and
        # str(20210315.0) would not match the %Y%m%d format.
        return pd.to_datetime(col.astype(np.int64).astype(str),
                              format='%Y%m%d', errors='coerce').dt.year

    # Years between the two recorded sales.
    df_flip['time_between_sale'] = (_sale_year(df_flip['currentsalerecordingdate'])
                                    - _sale_year(df_flip['prevsalerecordingdate']))
    df_flip['profit'] = df_flip.currentsalesprice - df_flip.prevsalesprice
    df_flip = df_flip[df_flip.profit > 0].copy()
    df_flip['profit_percentage'] = round((df_flip.profit / df_flip.prevsalesprice) * 100, 2)
    # Flip parameters: resold within 2 years, 25% <= profit% < 1000%.
    flip = df_flip[(df_flip.time_between_sale <= 2) &
                   (df_flip['profit_percentage'] >= 25) &
                   (df_flip['profit_percentage'] < 1000)]
    return flip
def Compare_2_dataframe(df1, df2, df1_id, df2_id):
    """Split an outer merge of two cleaned dataframes into matched / left / right rows.

    Intended for datasets already passed through the cleaning and filter
    functions above.
    Input:
        - df1: dataframe without missing IDs, e.g. an MLS listing
        - df2: dataframe without missing IDs to compare against, e.g. assessor records
        - df1_id: ID column of df1, e.g. "county_data_id"
        - df2_id: ID column of df2, e.g. "propertyid"
    Output:
        - match_df1_df2: rows present in both df1 and df2
        - only_df1: rows present only in df1
        - only_df2: rows present only in df2
    """
    # Outer merge with indicator=True adds a "_merge" column that labels each
    # row as "both", "left_only", or "right_only".
    merged = df1.merge(df2, left_on=df1_id, right_on=df2_id,
                       how="outer", indicator=True)
    flag = merged["_merge"]
    match_df1_df2 = merged[flag == "both"]
    only_df1 = merged[flag == "left_only"]
    only_df2 = merged[flag == "right_only"]
    return match_df1_df2, only_df1, only_df2
#check old
def On_Off_Market(city_name, mls, assessor_df, firstamerican_list):
    """Superseded draft: classify sold properties as On or Off market.

    NOTE(review): dead code — this definition is immediately shadowed by the
    second `On_Off_Market` defined later in this file, so it never runs. It
    differs from the live version only in carrying `ownername1full` through
    the off-market frames. Kept verbatim pending deletion.
    """
    cleaned_assessor = clean_assessor_record(assessor_df, city_name)
    cleaned_mls = clean_listing_data(mls, "county_data_id", "city", city_name)
    cleaned_amricanlist = clean_listing_data(firstamerican_list,"propertyid","situscity", city_name)
    #match1: assessor vs. mls
    assessor_mls = Compare_2_dataframe(cleaned_mls, cleaned_assessor,
                                       'county_data_id','propertyid')
    #take matched rows => On market values
    on_matched_mls = assessor_mls[0][["county_data_id","city",
                                      "property_key", "status", "sold_on", ]] #only want these columns
    on_MLS = on_matched_mls.drop_duplicates(subset=["county_data_id", "property_key",
                                                    "sold_on"], keep='first').sort_values("county_data_id") #drop duplicated rows after matching
    #take "right_only" rows from assessor record
    not_matched_assessor = assessor_mls[2]
    not_matched_assessor_subset = not_matched_assessor[["propertyid" , "situscity", 'ownername1full',
                                                        "situsfullstreetaddress","currentsalerecordingdate",
                                                        "prevsalerecordingdate", "property_key"]] #select original columns belongs to assessor record
    #------------------------ON First American Listing-----------------------------------------------
    #match2: american listing Vs. not_matched_assessor
    assessor_americanlist = Compare_2_dataframe(cleaned_amricanlist, not_matched_assessor_subset ,
                                                'propertyid','propertyid')
    # take matched rows => On market values
    on_matched_americanlist = assessor_americanlist[0][["propertyid","situscity_y",
                                                        'situsfullstreetaddress_y',
                                                        "status","solddate"]] #keep original columns of first American listing
    on_americanlist = on_matched_americanlist.drop_duplicates(subset=[
        "propertyid","solddate"], keep='first').sort_values("propertyid")
    #-------------------ON market that assessor record matched MLS and American listing ----
    on_americanlist.columns = on_MLS.columns.to_list()
    # NOTE(review): DataFrame.append was removed in pandas 2.0; the live
    # version needs pd.concat. Irrelevant here since this definition is dead.
    on_combine_MLS_American = on_MLS.append(on_americanlist)
    #drop those duplicate of property_key and sold_on (count month and year only)
    on_combine_MLS_American.sold_on = pd.to_datetime(on_combine_MLS_American.sold_on, errors='coerce').dt.strftime('%Y-%m')
    on_MLS_American = on_combine_MLS_American.drop_duplicates(["property_key","sold_on"]).sort_values("county_data_id")
    #------------------------OFF market in Assessor Records-------------------------------------------
    #take not matched from match2, the left => Off market values (never been listed)
    off_market = assessor_americanlist[2][["propertyid" , 'ownername1full','situscity_y', 'situsfullstreetaddress_y',
                                           "currentsalerecordingdate", "prevsalerecordingdate"]]
    off_market.columns = ["propertyid" ,'ownername1full', "situscity", "situsfullstreetaddress",
                          "currentsalerecordingdate", "prevsalerecordingdate"]
    #off current sold
    off_current = off_market[["propertyid",'ownername1full',
                              "situscity",
                              "situsfullstreetaddress",
                              "currentsalerecordingdate"]]
    #off previous sold
    off_prev = off_market[["propertyid",'ownername1full',
                           "situscity",
                           "situsfullstreetaddress",
                           "prevsalerecordingdate"]]
    #rename and combine them together => Off or previous and current sold
    off_current.rename(columns={'currentsalerecordingdate': 'solddate'}, inplace=True)
    off_prev.rename(columns={'prevsalerecordingdate': 'solddate'}, inplace=True)
    off_current_and_prev = off_current.append(off_prev).dropna().sort_values("propertyid")
    off_current_and_prev['solddate'] = pd.to_datetime(off_current_and_prev['solddate'].astype(str), format='%Y%m%d')
    off_current_and_prev['solddate'] = pd.to_datetime(off_current_and_prev['solddate'], errors='coerce').dt.strftime('%Y-%m')
    return on_MLS, on_americanlist, on_MLS_American , off_market, off_current_and_prev
def On_Off_Market(city_name, mls, assessor_df, firstamerican_list):
    """Check whether each sold property was listed On MLS / 1st American or sold Off market.

    INPUT:
        - city_name: name of a city, lower-case string, as in the other functions
        - mls: MLS dataset, e.g. chicago_list
        - assessor_df: assessor-records dataframe
        - firstamerican_list: 1st American listing dataframe
    OUTPUT:
        - on_MLS: assessor-record rows that matched an MLS listing
        - on_americanlist: rows that did not match MLS but matched 1st American
        - on_MLS_American: on_MLS and on_americanlist combined, de-duplicated
          per (property_key, sold month)
        - off_market: rows that matched neither MLS nor 1st American
        - off_current_and_prev: one row per off-market transaction, counting
          both the previous and the current sale dates
    """
    cleaned_assessor = clean_assessor_record(assessor_df, city_name)
    cleaned_mls = clean_listing_data(mls, "county_data_id", "city", city_name)
    cleaned_amricanlist = clean_listing_data(firstamerican_list, "propertyid", "situscity", city_name)
    # match1: assessor vs. MLS
    assessor_mls = Compare_2_dataframe(cleaned_mls, cleaned_assessor,
                                       'county_data_id', 'propertyid')
    # ------------------------ ON MLS ------------------------
    on_matched_mls = assessor_mls[0][["county_data_id", "city",
                                      "property_key", "status", "sold_on"]]  # only these columns
    on_MLS = on_matched_mls.drop_duplicates(subset=["county_data_id", "property_key", "sold_on"],
                                            keep='first').sort_values("county_data_id")
    # "right_only" rows = assessor records not found on MLS
    not_matched_assessor = assessor_mls[2]
    not_matched_assessor_subset = not_matched_assessor[[
        "propertyid", "situscity",
        "situsfullstreetaddress", "currentsalerecordingdate",
        "prevsalerecordingdate", "property_key"]]  # original assessor-record columns
    # ------------------------ ON First American Listing ------------------------
    # match2: 1st American listing vs. the assessor rows MLS missed
    assessor_americanlist = Compare_2_dataframe(cleaned_amricanlist, not_matched_assessor_subset,
                                                'propertyid', 'propertyid')
    on_matched_americanlist = assessor_americanlist[0][["propertyid", "situscity_y",
                                                        'situsfullstreetaddress_y',
                                                        "status", "solddate"]]  # original 1st American columns
    on_americanlist = on_matched_americanlist.drop_duplicates(subset=["propertyid", "solddate"],
                                                              keep='first').sort_values("propertyid")
    # ------------------ ON market: MLS + 1st American combined ------------------
    on_americanlist.columns = on_MLS.columns.to_list()
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    on_combine_MLS_American = pd.concat([on_MLS, on_americanlist])
    # De-duplicate per property_key and sold month (year-month granularity).
    on_combine_MLS_American['sold_on'] = pd.to_datetime(on_combine_MLS_American['sold_on'],
                                                        errors='coerce').dt.strftime('%Y-%m')
    on_MLS_American = on_combine_MLS_American.drop_duplicates(["property_key", "sold_on"]).sort_values("county_data_id")
    # ------------------------ OFF market (assessor only) ------------------------
    # match2 "right_only" rows were never listed anywhere => Off market.
    # .copy() so the renames/assignments below act on an owned frame.
    off_market = assessor_americanlist[2][["propertyid", 'situscity_y',
                                           'situsfullstreetaddress_y', "currentsalerecordingdate",
                                           "prevsalerecordingdate"]].copy()
    off_market.columns = ["propertyid", "situscity",
                          "situsfullstreetaddress",
                          "currentsalerecordingdate",
                          "prevsalerecordingdate"]
    # One frame per sale date, renamed to a common 'solddate' column,
    # then stacked so each off-market transaction counts once.
    off_current = off_market[["propertyid", "situscity",
                              "situsfullstreetaddress", "currentsalerecordingdate"]] \
        .rename(columns={'currentsalerecordingdate': 'solddate'})
    off_prev = off_market[["propertyid", "situscity",
                           "situsfullstreetaddress", "prevsalerecordingdate"]] \
        .rename(columns={'prevsalerecordingdate': 'solddate'})
    off_current_and_prev = pd.concat([off_current, off_prev]).dropna().sort_values("propertyid")
    # Dates are YYYYMMDD numbers; go through to_numeric/Int64 so float values
    # like 20210315.0 still parse, then keep year-month only.
    off_current_and_prev['solddate'] = pd.to_datetime(
        pd.to_numeric(off_current_and_prev['solddate'], errors='coerce').astype('Int64').astype(str),
        format='%Y%m%d', errors='coerce').dt.strftime('%Y-%m')
    return on_MLS, on_americanlist, on_MLS_American, off_market, off_current_and_prev
def show_invested_properties(city_name, mls, assessor_df, my1, my2):
    """Plot monthly On vs Off market counts for sold invested properties.

    INPUT:
        - city_name: name of a city, lower-case string
        - mls: MLS dataset for that city, e.g. chicago_list
        - assessor_df: assessor-records dataframe
        - my1, my2: inclusive year-month bounds, e.g. "2021-01", "2021-12"
    OUTPUT:
        - None; shows a stacked bar chart of On vs Off counts per month.
    NOTE: uses the module-level `american_list` dataframe for the
    1st American match.
    """
    selected_cols = ['propertyid', 'situsfullstreetaddress', 'situscity',
                     'ownername1full', 'mailingfullstreetaddress', 'yearbuilt',
                     'storiesnbrcode', 'currentsalerecordingdate', 'currentsalesprice',
                     'prevsalerecordingdate', 'prevsalesprice']
    assessor_clean = clean_assessor_record(assessor_df, city_name)[selected_cols]
    # Apply the invested-properties filter.
    assessor_address = invested_properties(assessor_clean)[selected_cols].drop_duplicates(keep="first")
    # Classify each sale as On or Off market.
    df = On_Off_Market(city_name, mls, assessor_address, american_list)
    # Count On-market sales per month (df[2] = on_MLS_American).
    on_df = df[2][["sold_on", "status"]].groupby(["sold_on"]).size().reset_index(name='On')
    # Count Off-market sales per month (df[4] = off_current_and_prev).
    off_df = df[4][["solddate", "propertyid"]].groupby(["solddate"]).size().reset_index(name='Off')
    on_off_df = off_df.merge(on_df, left_on="solddate", right_on="sold_on", how="outer")
    # BUG FIX: after the outer merge, off-only months have NaN `sold_on` and
    # on-only months have NaN `solddate`; the original filtered on `sold_on`
    # (silently dropping every off-only month) while plotting `solddate`.
    # Coalesce the two into one month column for both filter and x axis.
    on_off_df['month'] = on_off_df['solddate'].combine_first(on_off_df['sold_on'])
    on_off_year = on_off_df[(on_off_df.month >= my1) & (on_off_df.month <= my2)].sort_values('month')
    # Stacked bar chart: On (blue) vs Off (red) per month.
    fig1 = go.Figure(data=[
        go.Bar(name='On', x=on_off_year.month, y=on_off_year.On,
               text=on_off_year.On,
               marker_color='#3283FE',
               texttemplate='%{text:.2s}'),
        go.Bar(name='Off', x=on_off_year.month, y=on_off_year.Off,
               text=on_off_year.Off,
               marker_color='#EF553B',
               texttemplate='%{text:.2s}')
    ])
    fig1.update_layout(title=f' Total Sold Invested Properties in {city_name.capitalize()}.',
                       title_x=0.5,
                       legend_title="Market", yaxis_title='#of Sold Properties',
                       barmode='stack',
                       height=400, width=800,
                       yaxis_range=[0, 1000])
    fig1.show()
    return None
def show_flip_properties(city_name, mls, assessor_df, my1, my2):
    """Plot monthly On vs Off market counts for sold flip properties.

    INPUT:
        - city_name: name of a city, lower-case string
        - mls: MLS dataset for that city, e.g. chicago_list
        - assessor_df: assessor-records dataframe
        - my1, my2: inclusive year-month bounds, e.g. "2021-01", "2021-12"
    OUTPUT:
        - None; shows a stacked bar chart of On vs Off flipped deals per month.
    NOTE: uses the module-level `american_list` dataframe for the
    1st American match.
    """
    selected_cols = ['propertyid', 'situsfullstreetaddress', 'situscity',
                     'ownername1full', 'mailingfullstreetaddress', 'yearbuilt',
                     'storiesnbrcode', 'currentsalerecordingdate', 'currentsalesprice',
                     'prevsalerecordingdate', 'prevsalesprice']  # only the columns needed
    assessor_clean = clean_assessor_record(assessor_df, city_name)[selected_cols]
    # Invested properties first, then break down to flips.
    assessor_address = invested_properties(assessor_clean)[selected_cols].drop_duplicates(keep="first")
    flip = identify_flip(assessor_address)
    df = On_Off_Market(city_name, mls, flip, american_list)
    # Count On-market sales per month (df[2] = on_MLS_American).
    on_df = df[2][["sold_on", "status"]].groupby(["sold_on"]).size().reset_index(name='On')
    # Count Off-market sales per month (df[4] = off_current_and_prev).
    off_df = df[4][["solddate", "propertyid"]].groupby(["solddate"]).size().reset_index(name='Off')
    on_off_df = off_df.merge(on_df, left_on="solddate", right_on="sold_on", how="outer")
    # BUG FIX: coalesce `solddate`/`sold_on` (each NaN on one side of the
    # outer merge) into one month column; the original filtered on `sold_on`
    # only, silently dropping every off-only month, while plotting `solddate`.
    on_off_df['month'] = on_off_df['solddate'].combine_first(on_off_df['sold_on'])
    on_off_year = on_off_df[(on_off_df.month >= my1) & (on_off_df.month <= my2)].sort_values('month')
    # Stacked bar chart: On (blue) vs Off (red) per month.
    fig2 = go.Figure(data=[
        go.Bar(name='On', x=on_off_year.month, y=on_off_year.On,
               text=on_off_year.On,
               marker_color='#3283FE',
               texttemplate='%{text:.2s}'),
        go.Bar(name='Off', x=on_off_year.month, y=on_off_year.Off,
               text=on_off_year.Off,
               marker_color='#EF553B',
               texttemplate='%{text:.2s}')
    ])
    fig2.update_layout(title=f' Total Sold Flip Properties in {city_name.capitalize()}.',
                       title_x=0.5,
                       legend_title="Market", yaxis_title='#of Sold Properties',
                       barmode='stack',
                       height=400, width=800,
                       yaxis_range=[0, 150])
    fig2.show()
    return None
# Render the invested and flip On/Off breakdowns for each metro, calendar year 2021.
for _city, _mls in (("chicago", chicago_list),
                    ("dallas", dallas_list),
                    ("denver", denver_list)):
    show_invested_properties(_city, _mls, assessor_records, "2021-01", "2021-12")
    show_flip_properties(_city, _mls, assessor_records, "2021-01", "2021-12")